<img src="img/RNN-rolled.png" width="80px" height="80px" />
<img src="img/RNN-unrolled.png" width="400px" height="400px" />
<img src="img/LSTM3-chain.png" width="800px" height="800px" />
In [ ]:
from keras.optimizers import SGD
from keras.preprocessing.text import one_hot,text_to_word_sequence,base_filter
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
In [ ]:
from sklearn.cross_validation import train_test_split
In [ ]:
import os
import pickle
import numpy as np
import re
In [ ]:
import pandas as pd
In [ ]:
DATA_DIRECTORY = os.path.join('../data')
print(DATA_DIRECTORY)
In [ ]:
with open(os.path.join(DATA_DIRECTORY,"male_blog_list.txt"),"rb") as male_file:
male_posts= pickle.load(male_file)
with open(os.path.join(DATA_DIRECTORY,"female_blog_list.txt"),"rb") as female_file:
female_posts = pickle.load(female_file)
In [ ]:
filtered_male_posts = []
filtered_female_posts = []

# drop empty posts and strip newline characters
for post_male in male_posts:
    if len(post_male) == 0:
        continue
    post_male = re.sub('\n', '', post_male)
    filtered_male_posts.append(post_male)

for post_female in female_posts:
    if len(post_female) == 0:
        continue
    post_female = re.sub('\n', '', post_female)
    filtered_female_posts.append(post_female)
In [ ]:
all_posts = []
In [ ]:
all_posts.extend(filtered_male_posts)
all_posts.extend(filtered_female_posts)
In [ ]:
type(all_posts)
In [ ]:
all_posts[1]
In [ ]:
len(all_posts),len(filtered_male_posts),len(filtered_female_posts)
In [ ]:
# 0 for male, 1 for female
concatenate_array_rnn = np.concatenate((np.zeros(len(filtered_male_posts)),np.ones(len(filtered_female_posts))))
In [ ]:
char_list = list(set(''.join(all_posts)))
In [ ]:
# reserve index 0 for the padding value used by pad_sequences and mask_zero
char_indices = dict((c, i + 1) for i, c in enumerate(char_list))
indices_char = dict((i + 1, c) for i, c in enumerate(char_list))
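A quick (hypothetical) round-trip check of the two lookup tables:
In [ ]:
# sanity check: map a character to its index and back
c = char_list[0]
print(c, char_indices[c], indices_char[char_indices[c]])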
In [ ]:
label_indices = {'male':0,'female':1}
indices_label = {0:'male',1:'female'}
In [ ]:
MAX_LENGTH = 0
MAX_INDEX = 0
for i, n in enumerate(all_posts):
    if len(n) > MAX_LENGTH:
        MAX_LENGTH = len(n)
        MAX_INDEX = i
print(MAX_LENGTH, MAX_INDEX)
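Before fixing a cutoff, it helps to look at the post-length distribution; an optional sketch using only NumPy:
In [ ]:
# optional: median, 90th and 99th percentile of post lengths,
# to see how much a 5000-character cap truncates
post_lengths = [len(p) for p in all_posts]
print(np.percentile(post_lengths, [50, 90, 99]))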
In [ ]:
# cap every post at 5000 characters: shorter posts are padded, longer ones truncated
MAX_LENGTH = 5000
In [ ]:
def blog_to_char_seq(blog):
    # map each character to its index, then pad/truncate to MAX_LENGTH
    blog_chars_indices = [char_indices[char] for char in blog]
    return sequence.pad_sequences([blog_chars_indices], maxlen=MAX_LENGTH)[0]
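A small sanity check (hypothetical example, assuming these characters occur in the corpus vocabulary): a short string should come back as a MAX_LENGTH vector, left-padded with the reserved 0 index:
In [ ]:
# hypothetical check of the helper on a short string
demo = blog_to_char_seq("hello world")
print(demo.shape)    # (5000,)
print(demo[-11:])    # indices of the 11 characters of "hello world"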
In [ ]:
X = []
y = []
for n, l in zip(all_posts, concatenate_array_rnn):
    X.append(blog_to_char_seq(n))
    y.append(l)
# note: uint8 only holds indices up to 255; use a wider dtype if the vocabulary is larger
X = np.array(X).astype(np.uint8)
y = np.array(y)
print(X.shape, y.shape)
In [ ]:
y
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
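Optionally, passing stratify=y keeps the male/female ratio the same in both splits (the argument needs scikit-learn >= 0.17); this would replace the split above:
In [ ]:
# optional: stratified split so train and test keep the same class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42)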
In [ ]:
len(char_list)
In [ ]:
model = Sequential()
# input_dim is len(char_list) + 1 because index 0 is reserved for padding (mask_zero=True)
model.add(Embedding(len(char_list) + 1, 32, input_length=MAX_LENGTH, mask_zero=True))
model.add(LSTM(32, return_sequences=False))
model.add(Dropout(0.1))
model.add(Dense(1))
model.add(Activation('sigmoid'))
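A quick look at the layer output shapes and parameter counts before training:
In [ ]:
# inspect the architecture: output shapes and parameter counts per layer
model.summary()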
In [ ]:
model.compile(loss='binary_crossentropy', optimizer='adagrad', metrics=['accuracy'])
In [ ]:
model.fit(X_train, y_train,
          batch_size=32, nb_epoch=2,
          validation_split=0.1,
          verbose=1)
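The EarlyStopping callback imported at the top is never used above; a sketch of how it could be wired in, stopping once the validation loss stops improving:
In [ ]:
# optional sketch: let EarlyStopping pick the number of epochs
early_stop = EarlyStopping(monitor='val_loss', patience=1, verbose=1)
model.fit(X_train, y_train,
          batch_size=32, nb_epoch=10,
          validation_split=0.1,
          callbacks=[early_stop],
          verbose=1)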
In [ ]:
model.evaluate(X_test, y_test, batch_size=32)
In [ ]:
predicted_output = model.predict(X_test, batch_size=32)
predicted_classes = model.predict_classes(X_test, batch_size=32)
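For a single sigmoid output, predict_classes should be equivalent to thresholding the predicted probabilities at 0.5; a quick check:
In [ ]:
# the class predictions should match a simple 0.5 threshold on the probabilities
manual_classes = (predicted_output > 0.5).astype('int32')
print((manual_classes == predicted_classes).all())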
In [ ]:
# columns are filled in below
df = pd.DataFrame()
In [ ]:
df['predicted_class'] = predicted_classes.flatten()
df['predicted'] = predicted_output.flatten()
In [ ]:
df['actual'] = y_test
In [ ]:
df.predicted_class.value_counts()
In [ ]:
df.actual.value_counts()
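A confusion table gives a fuller picture than the two value counts above; a sketch using pandas:
In [ ]:
# cross-tabulate actual labels against predicted classes
pd.crosstab(df.actual, df.predicted_class)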
In [ ]: